Lab 8: Hyper Parameter Tuning

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
✔ broom        1.0.7     ✔ rsample      1.2.1
✔ dials        1.3.0     ✔ tune         1.2.1
✔ infer        1.0.7     ✔ workflows    1.1.4
✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
✔ parsnip      1.2.1     ✔ yardstick    1.3.2
✔ recipes      1.1.0     
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
• Dig deeper into tidy modeling with R at https://www.tmwr.org
library(powerjoin)
library(glue)
library(vip)

Attaching package: 'vip'

The following object is masked from 'package:utils':

    vi
library(baguette)
library(visdat)
library(plotly)

Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout
library(patchwork)

Data Cleaning/Import/Tidy/Transform

root  <- 'https://gdex.ucar.edu/dataset/camels/file'

# download.file('https://gdex.ucar.edu/dataset/camels/file/camels_attributes_v2.0.pdf', 
    #        'data/camels_attributes_v2.0.pdf')

types <- c("clim", "geol", "soil", "topo", "vege", "hydro")

remote_files  <- glue('{root}/camels_{types}.txt')
local_files   <- glue('data/camels_{types}.txt')

#walk2(remote_files, local_files, download.file, quiet = TRUE)

camels <- purrr::map(local_files, ~read_delim(.x, show_col_types = FALSE))

camels <- power_full_join(camels ,by = 'gauge_id')

summary(camels)
   gauge_id             p_mean          pet_mean     p_seasonality     
 Length:671         Min.   :0.6446   Min.   :1.899   Min.   :-1.43546  
 Class :character   1st Qu.:2.3731   1st Qu.:2.335   1st Qu.:-0.26352  
 Mode  :character   Median :3.2295   Median :2.688   Median : 0.08093  
                    Mean   :3.2577   Mean   :2.787   Mean   :-0.04128  
                    3rd Qu.:3.7835   3rd Qu.:3.146   3rd Qu.: 0.22399  
                    Max.   :8.9369   Max.   :4.744   Max.   : 0.92202  
                                                                       
   frac_snow          aridity       high_prec_freq  high_prec_dur  
 Min.   :0.00000   Min.   :0.2203   Min.   : 7.90   Min.   :1.075  
 1st Qu.:0.03514   1st Qu.:0.6957   1st Qu.:18.50   1st Qu.:1.209  
 Median :0.09793   Median :0.8551   Median :22.00   Median :1.282  
 Mean   :0.17760   Mean   :1.0565   Mean   :20.93   Mean   :1.350  
 3rd Qu.:0.22306   3rd Qu.:1.2673   3rd Qu.:24.23   3rd Qu.:1.440  
 Max.   :0.90633   Max.   :5.2079   Max.   :32.70   Max.   :2.091  
                                                                   
 high_prec_timing   low_prec_freq    low_prec_dur    low_prec_timing   
 Length:671         Min.   :169.9   Min.   : 2.789   Length:671        
 Class :character   1st Qu.:232.7   1st Qu.: 4.241   Class :character  
 Mode  :character   Median :255.8   Median : 4.950   Mode  :character  
                    Mean   :254.6   Mean   : 5.954                     
                    3rd Qu.:278.9   3rd Qu.: 6.702                     
                    Max.   :348.7   Max.   :36.513                     
                                                                       
 geol_1st_class     glim_1st_class_frac geol_2nd_class     glim_2nd_class_frac
 Length:671         Min.   :0.2967      Length:671         Min.   :0.000000   
 Class :character   1st Qu.:0.6083      Class :character   1st Qu.:0.002894   
 Mode  :character   Median :0.8294      Mode  :character   Median :0.136540   
                    Mean   :0.7855                         Mean   :0.155426   
                    3rd Qu.:0.9971                         3rd Qu.:0.266373   
                    Max.   :1.0000                         Max.   :0.489930   
                                                                              
 carbonate_rocks_frac geol_porostiy     geol_permeability soil_depth_pelletier
 Min.   :0.00000      Min.   :0.01000   Min.   :-16.50    Min.   : 0.2667     
 1st Qu.:0.00000      1st Qu.:0.06767   1st Qu.:-14.77    1st Qu.: 1.0000     
 Median :0.00000      Median :0.13190   Median :-13.96    Median : 1.2283     
 Mean   :0.11874      Mean   :0.12637   Mean   :-13.89    Mean   :10.8728     
 3rd Qu.:0.04333      3rd Qu.:0.18623   3rd Qu.:-13.00    3rd Qu.:12.8894     
 Max.   :1.00000      Max.   :0.28000   Max.   :-10.90    Max.   :50.0000     
                      NA's   :3                                               
 soil_depth_statsgo soil_porosity    soil_conductivity max_water_content
 Min.   :0.3999     Min.   :0.3733   Min.   : 0.4469   Min.   :0.0866   
 1st Qu.:1.1054     1st Qu.:0.4309   1st Qu.: 0.9321   1st Qu.:0.4293   
 Median :1.4577     Median :0.4422   Median : 1.3477   Median :0.5579   
 Mean   :1.2932     Mean   :0.4426   Mean   : 1.7405   Mean   :0.5280   
 3rd Qu.:1.5000     3rd Qu.:0.4554   3rd Qu.: 1.9323   3rd Qu.:0.6450   
 Max.   :1.5000     Max.   :0.6800   Max.   :13.9557   Max.   :1.0520   
                                                                        
   sand_frac        silt_frac        clay_frac        water_frac     
 Min.   : 8.184   Min.   : 2.985   Min.   : 1.846   Min.   : 0.0000  
 1st Qu.:25.437   1st Qu.:23.947   1st Qu.:13.999   1st Qu.: 0.0000  
 Median :35.269   Median :34.059   Median :18.663   Median : 0.0000  
 Mean   :36.468   Mean   :33.859   Mean   :19.886   Mean   : 0.1017  
 3rd Qu.:44.457   3rd Qu.:43.639   3rd Qu.:25.420   3rd Qu.: 0.0000  
 Max.   :91.976   Max.   :67.775   Max.   :50.354   Max.   :19.3545  
                                                                     
  organic_frac       other_frac       gauge_lat       gauge_lon      
 Min.   : 0.0000   Min.   : 0.000   Min.   :27.05   Min.   :-124.39  
 1st Qu.: 0.0000   1st Qu.: 0.000   1st Qu.:35.70   1st Qu.:-110.41  
 Median : 0.0000   Median : 1.309   Median :39.25   Median : -92.78  
 Mean   : 0.5918   Mean   : 9.825   Mean   :39.24   Mean   : -95.79  
 3rd Qu.: 0.0000   3rd Qu.:11.737   3rd Qu.:43.21   3rd Qu.: -81.77  
 Max.   :57.8631   Max.   :99.378   Max.   :48.82   Max.   : -67.94  
                                                                     
   elev_mean         slope_mean        area_gages2       area_geospa_fabric
 Min.   :  10.21   Min.   :  0.8222   Min.   :    4.03   Min.   :    4.1   
 1st Qu.: 249.67   1st Qu.:  7.4268   1st Qu.:  122.28   1st Qu.:  128.0   
 Median : 462.72   Median : 28.8016   Median :  329.68   Median :  340.7   
 Mean   : 759.42   Mean   : 46.1953   Mean   :  792.62   Mean   :  808.1   
 3rd Qu.: 928.88   3rd Qu.: 73.1695   3rd Qu.:  794.29   3rd Qu.:  804.5   
 Max.   :3571.18   Max.   :255.6884   Max.   :25791.04   Max.   :25817.8   
                                                                           
  frac_forest        lai_max          lai_diff         gvf_max      
 Min.   :0.0000   Min.   :0.3671   Min.   :0.1544   Min.   :0.1843  
 1st Qu.:0.2771   1st Qu.:1.8143   1st Qu.:1.1968   1st Qu.:0.6086  
 Median :0.8137   Median :3.3713   Median :2.3365   Median :0.7803  
 Mean   :0.6395   Mean   :3.2160   Mean   :2.4486   Mean   :0.7221  
 3rd Qu.:0.9724   3rd Qu.:4.6963   3rd Qu.:3.7574   3rd Qu.:0.8649  
 Max.   :1.0000   Max.   :5.5821   Max.   :4.8315   Max.   :0.9157  
                                                                    
    gvf_diff      dom_land_cover_frac dom_land_cover     root_depth_50   
 Min.   :0.0290   Min.   :0.3145      Length:671         Min.   :0.1200  
 1st Qu.:0.1883   1st Qu.:0.6511      Class :character   1st Qu.:0.1654  
 Median :0.3160   Median :0.8582      Mode  :character   Median :0.1800  
 Mean   :0.3227   Mean   :0.8100                         Mean   :0.1788  
 3rd Qu.:0.4627   3rd Qu.:0.9967                         3rd Qu.:0.1900  
 Max.   :0.6522   Max.   :1.0000                         Max.   :0.2500  
                                                         NA's   :24      
 root_depth_99       q_mean          runoff_ratio        slope_fdc     
 Min.   :1.500   Min.   :0.004553   Min.   :0.004238   Min.   :0.0000  
 1st Qu.:1.522   1st Qu.:0.632918   1st Qu.:0.242443   1st Qu.:0.8978  
 Median :1.800   Median :1.131818   Median :0.350664   Median :1.2829  
 Mean   :1.830   Mean   :1.493967   Mean   :0.387146   Mean   :1.2372  
 3rd Qu.:2.000   3rd Qu.:1.750901   3rd Qu.:0.506681   3rd Qu.:1.6306  
 Max.   :3.100   Max.   :9.688438   Max.   :1.362132   Max.   :2.4973  
 NA's   :24      NA's   :1          NA's   :1          NA's   :1       
 baseflow_index      stream_elas            q5                q95        
 Min.   :0.006858   Min.   :-0.6363   Min.   :0.000000   Min.   : 0.000  
 1st Qu.:0.397430   1st Qu.: 1.3177   1st Qu.:0.009155   1st Qu.: 2.066  
 Median :0.504923   Median : 1.7006   Median :0.081568   Median : 3.769  
 Mean   :0.491447   Mean   : 1.8322   Mean   :0.171803   Mean   : 5.057  
 3rd Qu.:0.600345   3rd Qu.: 2.2255   3rd Qu.:0.219522   3rd Qu.: 6.288  
 Max.   :0.977556   Max.   : 6.2405   Max.   :2.424938   Max.   :31.817  
                    NA's   :1         NA's   :1          NA's   :1       
  high_q_freq        high_q_dur       low_q_freq       low_q_dur     
 Min.   :  0.000   Min.   : 0.000   Min.   :  0.00   Min.   :  0.00  
 1st Qu.:  6.412   1st Qu.: 1.821   1st Qu.: 37.44   1st Qu.: 10.00  
 Median : 15.100   Median : 2.848   Median : 96.00   Median : 15.52  
 Mean   : 25.745   Mean   : 6.913   Mean   :107.62   Mean   : 22.28  
 3rd Qu.: 35.788   3rd Qu.: 7.554   3rd Qu.:162.14   3rd Qu.: 26.91  
 Max.   :172.800   Max.   :92.559   Max.   :356.80   Max.   :209.88  
 NA's   :1         NA's   :1        NA's   :1        NA's   :1       
  zero_q_freq         hfd_mean    
 Min.   :0.00000   Min.   :112.2  
 1st Qu.:0.00000   1st Qu.:160.2  
 Median :0.00000   Median :173.8  
 Mean   :0.03415   Mean   :182.5  
 3rd Qu.:0.00000   3rd Qu.:204.1  
 Max.   :0.96537   Max.   :287.8  
 NA's   :1         NA's   :1      
skimr::skim(camels)
Data summary
Name camels
Number of rows 671
Number of columns 58
_______________________
Column type frequency:
character 6
numeric 52
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
gauge_id 0 1.00 8 8 0 671 0
high_prec_timing 0 1.00 3 3 0 4 0
low_prec_timing 0 1.00 3 3 0 4 0
geol_1st_class 0 1.00 12 31 0 12 0
geol_2nd_class 138 0.79 12 31 0 13 0
dom_land_cover 0 1.00 12 38 0 12 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
p_mean 0 1.00 3.26 1.41 0.64 2.37 3.23 3.78 8.94 ▃▇▂▁▁
pet_mean 0 1.00 2.79 0.55 1.90 2.34 2.69 3.15 4.74 ▇▇▅▂▁
p_seasonality 0 1.00 -0.04 0.53 -1.44 -0.26 0.08 0.22 0.92 ▁▂▃▇▂
frac_snow 0 1.00 0.18 0.20 0.00 0.04 0.10 0.22 0.91 ▇▂▁▁▁
aridity 0 1.00 1.06 0.62 0.22 0.70 0.86 1.27 5.21 ▇▂▁▁▁
high_prec_freq 0 1.00 20.93 4.55 7.90 18.50 22.00 24.23 32.70 ▂▃▇▇▁
high_prec_dur 0 1.00 1.35 0.19 1.08 1.21 1.28 1.44 2.09 ▇▅▂▁▁
low_prec_freq 0 1.00 254.65 35.12 169.90 232.70 255.85 278.92 348.70 ▂▅▇▅▁
low_prec_dur 0 1.00 5.95 3.20 2.79 4.24 4.95 6.70 36.51 ▇▁▁▁▁
glim_1st_class_frac 0 1.00 0.79 0.20 0.30 0.61 0.83 1.00 1.00 ▁▃▃▃▇
glim_2nd_class_frac 0 1.00 0.16 0.14 0.00 0.00 0.14 0.27 0.49 ▇▃▃▂▁
carbonate_rocks_frac 0 1.00 0.12 0.26 0.00 0.00 0.00 0.04 1.00 ▇▁▁▁▁
geol_porostiy 3 1.00 0.13 0.07 0.01 0.07 0.13 0.19 0.28 ▇▆▇▇▂
geol_permeability 0 1.00 -13.89 1.18 -16.50 -14.77 -13.96 -13.00 -10.90 ▂▅▇▅▂
soil_depth_pelletier 0 1.00 10.87 16.24 0.27 1.00 1.23 12.89 50.00 ▇▁▁▁▁
soil_depth_statsgo 0 1.00 1.29 0.27 0.40 1.11 1.46 1.50 1.50 ▁▁▂▂▇
soil_porosity 0 1.00 0.44 0.02 0.37 0.43 0.44 0.46 0.68 ▃▇▁▁▁
soil_conductivity 0 1.00 1.74 1.52 0.45 0.93 1.35 1.93 13.96 ▇▁▁▁▁
max_water_content 0 1.00 0.53 0.15 0.09 0.43 0.56 0.64 1.05 ▁▅▇▃▁
sand_frac 0 1.00 36.47 15.63 8.18 25.44 35.27 44.46 91.98 ▅▇▅▁▁
silt_frac 0 1.00 33.86 13.25 2.99 23.95 34.06 43.64 67.77 ▂▆▇▆▁
clay_frac 0 1.00 19.89 9.32 1.85 14.00 18.66 25.42 50.35 ▃▇▅▂▁
water_frac 0 1.00 0.10 0.94 0.00 0.00 0.00 0.00 19.35 ▇▁▁▁▁
organic_frac 0 1.00 0.59 3.84 0.00 0.00 0.00 0.00 57.86 ▇▁▁▁▁
other_frac 0 1.00 9.82 16.83 0.00 0.00 1.31 11.74 99.38 ▇▁▁▁▁
gauge_lat 0 1.00 39.24 5.21 27.05 35.70 39.25 43.21 48.82 ▂▃▇▆▅
gauge_lon 0 1.00 -95.79 16.21 -124.39 -110.41 -92.78 -81.77 -67.94 ▆▃▇▇▅
elev_mean 0 1.00 759.42 786.00 10.21 249.67 462.72 928.88 3571.18 ▇▂▁▁▁
slope_mean 0 1.00 46.20 47.12 0.82 7.43 28.80 73.17 255.69 ▇▂▂▁▁
area_gages2 0 1.00 792.62 1701.95 4.03 122.28 329.68 794.30 25791.04 ▇▁▁▁▁
area_geospa_fabric 0 1.00 808.08 1709.85 4.10 127.98 340.70 804.50 25817.78 ▇▁▁▁▁
frac_forest 0 1.00 0.64 0.37 0.00 0.28 0.81 0.97 1.00 ▃▁▁▂▇
lai_max 0 1.00 3.22 1.52 0.37 1.81 3.37 4.70 5.58 ▅▆▃▅▇
lai_diff 0 1.00 2.45 1.33 0.15 1.20 2.34 3.76 4.83 ▇▇▇▆▇
gvf_max 0 1.00 0.72 0.17 0.18 0.61 0.78 0.86 0.92 ▁▁▂▃▇
gvf_diff 0 1.00 0.32 0.15 0.03 0.19 0.32 0.46 0.65 ▃▇▅▇▁
dom_land_cover_frac 0 1.00 0.81 0.18 0.31 0.65 0.86 1.00 1.00 ▁▂▃▃▇
root_depth_50 24 0.96 0.18 0.03 0.12 0.17 0.18 0.19 0.25 ▃▃▇▂▂
root_depth_99 24 0.96 1.83 0.30 1.50 1.52 1.80 2.00 3.10 ▇▃▂▁▁
q_mean 1 1.00 1.49 1.54 0.00 0.63 1.13 1.75 9.69 ▇▁▁▁▁
runoff_ratio 1 1.00 0.39 0.23 0.00 0.24 0.35 0.51 1.36 ▆▇▂▁▁
slope_fdc 1 1.00 1.24 0.51 0.00 0.90 1.28 1.63 2.50 ▂▅▇▇▁
baseflow_index 0 1.00 0.49 0.16 0.01 0.40 0.50 0.60 0.98 ▁▃▇▅▁
stream_elas 1 1.00 1.83 0.78 -0.64 1.32 1.70 2.23 6.24 ▁▇▃▁▁
q5 1 1.00 0.17 0.27 0.00 0.01 0.08 0.22 2.42 ▇▁▁▁▁
q95 1 1.00 5.06 4.94 0.00 2.07 3.77 6.29 31.82 ▇▂▁▁▁
high_q_freq 1 1.00 25.74 29.07 0.00 6.41 15.10 35.79 172.80 ▇▂▁▁▁
high_q_dur 1 1.00 6.91 10.07 0.00 1.82 2.85 7.55 92.56 ▇▁▁▁▁
low_q_freq 1 1.00 107.62 82.24 0.00 37.44 96.00 162.14 356.80 ▇▆▅▂▁
low_q_dur 1 1.00 22.28 21.66 0.00 10.00 15.52 26.91 209.88 ▇▁▁▁▁
zero_q_freq 1 1.00 0.03 0.11 0.00 0.00 0.00 0.00 0.97 ▇▁▁▁▁
hfd_mean 1 1.00 182.52 33.53 112.25 160.16 173.77 204.05 287.75 ▂▇▃▂▁

Data Splitting

set.seed(123)

split_lab8 <- initial_split(camels, prop = 0.8)
train_lab8 <- training(split_lab8)
test_lab8  <- testing(split_lab8)

rec8 = recipe(q_mean ~ low_prec_freq + p_mean, data = train_lab8) |>
 # step_rm(gauge_lat, gauge_lon) |>
  step_unknown(all_nominal_predictors()) |>
  step_dummy(all_nominal()) |>  
  step_scale(all_numeric_predictors()) |>  
  step_center(all_numeric_predictors()) |>
  step_naomit(all_predictors(), all_outcomes())

folds8 <- vfold_cv(train_lab8, v = 10)

Building Models

rf_model8 <- rand_forest() |> 
  set_engine("ranger") |> 
  set_mode("regression")

lm_model8 <- linear_reg() |> 
  set_engine("lm") |> 
  set_mode("regression")

xgb_model8 <- boost_tree() |> 
  set_engine("xgboost") |> 
  set_mode("regression")

models <- list(
  rf = rf_model8,
  lm = lm_model8,
  xgb = xgb_model8
)

rm(wf8)
Warning in rm(wf8): object 'wf8' not found
wf8 <- workflow_set(list(rec8), list(lm_model8, rf_model8, xgb_model8)) |>
  workflow_map('fit_resamples', resamples = folds8) 

autoplot(wf8)

Here we can see the r squared values for all three models. The model with the highest r squared value is the best fit for our data set, so in this case it is the random forest model and that is the one I will move ahead with. Using the regression mode allows us to predict the continuous outcome variable, like q_mean, from our multiple independent variables.

Model Tuning

rf_tune <- rand_forest(trees = tune(), min_n = tune()) |>
  set_engine("ranger") |>
  set_mode("regression")

wf_tune <-  workflow(rec8, rf_tune)

Check Tunable Values

dials <- extract_parameter_set_dials(wf_tune) 
dials$object
[[1]]
# Trees (quantitative)
Range: [1, 2000]

[[2]]
Minimal Node Size (quantitative)
Range: [2, 40]

Define the Search Space

my.grid <- dials |> 
  update(trees = trees(c(50, 500))) |>
  grid_latin_hypercube(size = 25)
Warning: `grid_latin_hypercube()` was deprecated in dials 1.3.0.
ℹ Please use `grid_space_filling()` instead.
range(my.grid$trees)
[1]  53 483
plotly::plot_ly(my.grid, 
               x = ~trees, 
               y = ~min_n)
No trace type specified:
  Based on info supplied, a 'scatter' trace seems appropriate.
  Read more about this trace type -> https://plotly.com/r/reference/#scatter
No scatter mode specifed:
  Setting the mode to markers
  Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

Tune the Model

model_params <-  tune_grid(
    wf_tune,
    resamples = folds8,
    grid = my.grid,
    metrics = metric_set(rmse, rsq, mae),
    control = control_grid(save_pred = TRUE)
  )

autoplot(model_params)

Check the skill of the tuned model

tuned_results <- tune_grid(
  wf_tune,
  resamples = folds8,
  grid = 25
)

metrics <- collect_metrics(tuned_results)

show_best(model_params, metric = "mae")
# A tibble: 5 × 8
  trees min_n .metric .estimator  mean     n std_err .config              
  <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1   483    11 mae     standard   0.350    10  0.0188 Preprocessor1_Model25
2   125     8 mae     standard   0.351    10  0.0196 Preprocessor1_Model03
3   458    14 mae     standard   0.351    10  0.0189 Preprocessor1_Model19
4   418    16 mae     standard   0.352    10  0.0185 Preprocessor1_Model15
5    99     9 mae     standard   0.353    10  0.0210 Preprocessor1_Model18
hp_best <- select_best(model_params, metric = "mae")

When looking at the show_best tibble, the hyper parameter set that is best for this model is 205 for the trees hyper parameter and 15 for min_n. This hyper parameter has the lowest standard error and the highest mean, showing that this set will work best for my data.

Finalize your model

finalize <- finalize_workflow(wf_tune, hp_best)

Final Model Verification

final_fit <- last_fit(finalize, split = split_lab8)

collect_metrics(final_fit)
# A tibble: 2 × 4
  .metric .estimator .estimate .config             
  <chr>   <chr>          <dbl> <chr>               
1 rmse    standard       0.498 Preprocessor1_Model1
2 rsq     standard       0.903 Preprocessor1_Model1

The final model performs better on the test data than it did on the training data. The final model has a root mean squared error of 0.497, which is the average difference between the predicted values from the model and the actual values. This is much lower than the rmse of the training data, which was about 0.55. The final model also has a higher r-squared value than the training data, with a value of 0.903 compared to 0.88 for the training data. The r-squared value of 0.9 means that about 90% of the variance in the dependent variable can be explained by the independent variable.

preds <- collect_predictions(final_fit)
ggplot(preds, aes(x = q_mean, y = .pred, color = .pred)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, color = "darkblue") +
  geom_abline(slope = 1, intercept = 0, color = "red") +
  scale_color_viridis_c(option = "C") +
  labs(
    title = "Q Mean Predicted vs. Actual Values",
    x = "Actual Values",
    y = "Predicted Values",
    color = "Prediction"
  ) +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

Building a Map!

final_model_fit <- fit(finalize, data = camels)
predicted_data <- augment(final_model_fit, new_data = camels) |>
  mutate(residual = (.pred - q_mean^2))

map_pred <- ggplot(predicted_data, aes(x = gauge_lon, y = gauge_lat, color = .pred)) +
  geom_point(size = 1, alpha = 0.8) +
  coord_fixed() +
  scale_fill_viridis_c(option = "C") +
  labs(
    title = "Predicted Values",
    x = "Longitude",
    y = "Latitude",
    fill = "Prediction"
  ) +
  theme_minimal()

map_resid <- ggplot(predicted_data, aes(x = gauge_lon, y = gauge_lat, color = residual)) +
  geom_point(size = 1, alpha = 0.8) +
  coord_fixed() +
  scale_fill_viridis_c(option = "A") +
  labs(
    title = "Squared Residuals",
    x = "Longitude",
    y = "Latitude",
    fill = "Residual²"
  ) +
  theme_minimal()

map_pred / map_resid